## estimates SATE, calculates confidences intervals and p-values
## for the application outcome. p-values are randomization p-values for
## the null of no effect whatsoever. estimates are all intent-to-treat
## estimates.
##
## table of results has the following columns (in order):
##
## subgroup    estimate    95% CI    p-value    
##
##
## Cait Unkovic, Maya Sen, Kevin Quinn
##
## 12/9/2014
##

set.seed(553920)

nperm <- 5001

## utility functions

## stratum specific estimates of ATE
ATE.s <- function(data, outcome="applied"){
  sex.vals <- unique(data$sex)
  univ.vals <- unique(data$univ)

  univ <- rep(NA, length(univ.vals)*2)
  sex <- rep(NA, length(univ.vals)*2)
  n.units <- rep(NA, length(univ.vals)*2)
  ATE.hat <- rep(NA, length(univ.vals)*2)

  counter <- 1
  for (u in univ.vals){
    for (s in sex.vals){
      data.sub <- data[data$univ == u & data$sex == s,]
      treated.data <- na.omit(data.sub[data.sub$treat==1,outcome])
      control.data <- na.omit(data.sub[data.sub$treat==0,outcome])
      n.treat <- length(treated.data)
      n.control <- length(control.data)
            
      univ[counter] <- u
      sex[counter] <- s
      n.units[counter] <- n.treat + n.control
      ATE.hat[counter] <- mean(treated.data) - mean(control.data)
      
      counter <- counter + 1
      }
  }
  
  return(data.frame(univ, sex, n.units, ATE.hat, stringsAsFactors=FALSE))
}


## ATE for full sample or subset of sample
##
## subset should either be NULL which correspons to the full sample or
##        a logical vector with TRUE values corresponding to university-sex
##        combinations to include the average and FALSE values corresponding
##        to university-sex combinations to exclude from the average.
ATE <- function(data, outcome="applied", subset=NULL){
  holder <- ATE.s(data, outcome)
  if (is.null(subset)){
    subset=rep(TRUE, nrow(holder))
  }
  ## error checking for subset
  if (length(subset) != nrow(holder)){
    stop("length of subset not equal to number of university-sex combinations")
  }
  if (!is.logical(subset)){
    stop("subset is not a logical vector")
  }
  holder <- holder[subset,]
  weights <- holder$n.units / sum(holder$n.units)
  estimate <- sum(holder$ATE.hat*weights)
  return(estimate)
}







## p-value for the test of null of no individual effect whatsoever 
##
## subset should either be NULL which correspons to the full sample or
##        a logical vector with TRUE values corresponding to university-sex
##        combinations to include the average and FALSE values corresponding
##        to university-sex combinations to exclude from the average.
get.perm.pval <- function(data, outcome="applied", M=5001, subset=NULL){
  holder <- ATE.s(data, outcome)
  if (is.null(subset)){
    subset=rep(TRUE, nrow(holder))
  }
  ## error checking for subset
  if (length(subset) != nrow(holder)){
    stop("length of subset not equal to number of university-sex combinations")
  }
  if (!is.logical(subset)){
    stop("subset is not a logical vector")
  }
  
  ATE.s.obs <- holder[subset,]
  ATE.obs <- ATE(data, outcome, subset)

  ATE.sim <- rep(NA, M)
  ATE.s.sim <- matrix(NA, M, nrow(ATE.s.obs))

  sex.vals <- unique(ATE.s.obs$sex)
  univ.vals <- unique(ATE.s.obs$univ)
  for (iter in 1:M){
    counter <- 1
    n.obs <- rep(NA, nrow(ATE.s.obs))
    for (u in univ.vals){
      for (s in sex.vals){
        data.sub <- data[data$univ == u & data$sex == s,] 
        n.treat <- sum(data.sub$treat==1)
        n.control <- sum(data.sub$treat==0)
        data.sub$treat <- sample(c(rep(1, n.treat), rep(0, n.control)),
                                 replace=FALSE)
        
        treated.data <- na.omit(data.sub[data.sub$treat==1,outcome])
        control.data <- na.omit(data.sub[data.sub$treat==0,outcome])
        n.treat <- length(treated.data)
        n.control <- length(control.data)
        n.obs[counter] <- n.treat + n.control

        ATE.s.sim[iter, counter] <- mean(treated.data) - mean(control.data)
        
        counter <- counter + 1
      }
    }
    weights <- n.obs / sum(n.obs)
    ATE.sim[iter] <- sum(ATE.s.sim[iter,]*weights)
  }

  pvals.s <- rep(NA, nrow(ATE.s.obs))
  for (i in 1:nrow(ATE.s.obs)){
    pvals.s[i] <- mean(abs(ATE.s.sim[,i]) >= abs(ATE.s.obs$ATE.hat[i]) )
  }
  pval.overall <- mean(abs(ATE.sim) >= abs(ATE.obs) )

  output1 <- data.frame("ATE", sum(n.obs), ATE.obs, pval.overall)
  return(output1)
}



## tabulate data (used to format data for robins.ci() )
tabulate.data <- function(data, outcome="applied"){
  sex.vals <- unique(data$sex)
  univ.vals <- unique(data$univ)

  univ <- rep(NA, length(univ.vals)*2)
  sex <- rep(NA, length(univ.vals)*2)
  X0Y0 <- rep(NA, length(univ.vals)*2)
  X0Y1 <- rep(NA, length(univ.vals)*2)
  X1Y0 <- rep(NA, length(univ.vals)*2)
  X1Y1 <- rep(NA, length(univ.vals)*2)
  n.treated <- rep(NA, length(univ.vals)*2)
  n.control <- rep(NA, length(univ.vals)*2)
  n.units <- rep(NA, length(univ.vals)*2)
  
  counter <- 1
  for (u in univ.vals){
    for (s in sex.vals){
      data.sub <- data[data$univ == u & data$sex == s,]
      treated.data <- na.omit(data.sub[data.sub$treat==1,outcome])
      control.data <- na.omit(data.sub[data.sub$treat==0,outcome])

      n.treated[counter] <- length(treated.data)
      n.control[counter] <- length(control.data)            
      n.units[counter] <- n.treated[counter] + n.control[counter]
      univ[counter] <- u
      sex[counter] <- s
      X0Y0[counter] <- sum(control.data==0)
      X0Y1[counter] <- sum(control.data==1)
      X1Y0[counter] <- sum(treated.data==0)
      X1Y1[counter] <- sum(treated.data==1)
      
      counter <- counter + 1
      }
  }
  
  return(data.frame(univ, sex, n.units, n.treated, n.control, X0Y0, X0Y1,
                    X1Y0, X1Y1, stringsAsFactors=FALSE))

}





## computes the CI in Robins 1988 "Confidence Intervals for Causal Parameters"
## ignores the fact that treatment was randomized within blocks but
## does provide valid CIs for SATE
robins.ci <- function(data.tab, subset=NULL, level=0.05){
  library(RI2by2)
  if (is.null(subset)){
    subset=rep(TRUE, nrow(data.tab))
  }
  ## error checking for subset
  if (length(subset) != nrow(data.tab)){
    stop("length of subset not equal to number of university-sex combinations")
  }
  if (!is.logical(subset)){
    stop("subset is not a logical vector")
  }
  data.tab <- data.tab[subset,]


  data <- matrix(NA, 2, 2)
  data[1,1] <- sum(data.tab$X1Y1)
  data[1,2] <- sum(data.tab$X1Y0)
  data[2,1] <- sum(data.tab$X0Y1)
  data[2,2] <- sum(data.tab$X0Y0)
  
  rob.ci <- Robins.CI(data, level=0.05)
  output <- c(level, sum(data), rob.ci$tau.hat, rob.ci$lower, rob.ci$upper)
  names(output) <- c("level", "n", "ATE.hat", "lower", "upper")
  return(output)
}









###################################################################
## read data in and do some processing of the raw data
x <- read.csv("gradcontacts_treatedcontrol_import.csv",
              stringsAsFactors=FALSE)

names(x) <- c("univ", "sex", "ID", "treat", "applied", "accept",
              "app.sex", "app.race")

## replace'2' as an acceptance outcome -- the original dataset had three levels
## but only used two (0 indicates not accepted for any participation,
##                    2 indicates accepted for poster presentation)
x$accept[x$accept==2] <- 1

## replace 9 as an outcome for accept --
## I used 9 to indicate someonw who didn't apply and couldn't
## be accepted when hand merging the datasets
x$accept[x$accept==9] <- 0


## attach the school rank data
univlist <- read.csv("university.list.csv",
                     stringsAsFactors=FALSE)
## clean up a bad entry
univlist <- univlist[1:53,]
## convert rank to numeric
univlist$u.rank <- as.numeric(univlist$u.rank)

x$Dept.Rank <- NULL
for (u in unique(x$univ)){
  x$Dept.Rank[x$univ == u] <- univlist$u.rank[univlist$univ == u]
}











output.table <- data.frame(subgroup=c("full sample", "men", "women",
                             "top 10", "men top 10", "women top 10",
                             "top 11 to 25", "men top 11 to 25",
                             "women top 11 to 25",
                             "top 26 to 50", "men top 26 to 50",
                             "women top 26 to 50"),
                           estimate=NA,
                           lower95=NA, upper95=NA, pval=NA)


## overall effect
output.table$subgroup[1] <- "full sample"
output.table$estimate[1] <- ATE(x, "applied")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data)
output.table$lower95[1] <- ci["lower"]
output.table$upper95[1] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm)
output.table$pval[1] <- pvals["pval.overall"]


## effect for men
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[2] <- "men"
output.table$estimate[2] <- ATE(x, "applied", ATE.s.hat$sex=="Male")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$sex=="Male")
output.table$lower95[2] <- ci["lower"]
output.table$upper95[2] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$sex=="Male")
output.table$pval[2] <- pvals["pval.overall"]



## effect for women
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[3] <- "women"
output.table$estimate[3] <- ATE(x, "applied", ATE.s.hat$sex=="Female")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$sex=="Female")
output.table$lower95[3] <- ci["lower"]
output.table$upper95[3] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$sex=="Female")
output.table$pval[3] <- pvals["pval.overall"]




## effect for both men and women top-10 programs
top10 <- univlist$univ[univlist$u.rank <= 10]
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[4] <- "top 10"
output.table$estimate[4] <- ATE(x, "applied", ATE.s.hat$univ %in% top10)
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top10)
output.table$lower95[4] <- ci["lower"]
output.table$upper95[4] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top10)
output.table$pval[4] <- pvals["pval.overall"]



## effect for men in top-10 programs
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[5] <- "men top 10"
output.table$estimate[5] <- ATE(x, "applied",
                                ATE.s.hat$univ %in% top10 &
                                ATE.s.hat$sex=="Male")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top10 &
                tab.data$sex=="Male")
output.table$lower95[5] <- ci["lower"]
output.table$upper95[5] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top10 &
                       ATE.s.hat$sex=="Male")
output.table$pval[5] <- pvals["pval.overall"]



## effect for women in top-10 programs
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[6] <- "women top 10"
output.table$estimate[6] <- ATE(x, "applied",
                                ATE.s.hat$univ %in% top10 &
                                ATE.s.hat$sex=="Female")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top10 &
                tab.data$sex=="Female")
output.table$lower95[6] <- ci["lower"]
output.table$upper95[6] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top10 &
                       ATE.s.hat$sex=="Female")
output.table$pval[6] <- pvals["pval.overall"]





## effect for both men and women in 11-25 programs
top11.25 <- univlist$univ[univlist$u.rank <= 25 & univlist$u.rank >=11]
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[7] <- "top 11 to 25"
output.table$estimate[7] <- ATE(x, "applied", ATE.s.hat$univ %in% top11.25)
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top11.25)
output.table$lower95[7] <- ci["lower"]
output.table$upper95[7] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top11.25)
output.table$pval[7] <- pvals["pval.overall"]




## effect for men in 11-25 programs
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[8] <- "men top 11 to 25"
output.table$estimate[8] <- ATE(x, "applied",
                                ATE.s.hat$univ %in% top11.25 &
                                ATE.s.hat$sex=="Male")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top11.25 &
                tab.data$sex=="Male")
output.table$lower95[8] <- ci["lower"]
output.table$upper95[8] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top11.25 &
                       ATE.s.hat$sex=="Male")
output.table$pval[8] <- pvals["pval.overall"]



## effect for women in 11-25 programs
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[9] <- "women top 11 to 25"
output.table$estimate[9] <- ATE(x, "applied",
                                ATE.s.hat$univ %in% top11.25 &
                                ATE.s.hat$sex=="Female")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top11.25 &
                tab.data$sex=="Female")
output.table$lower95[9] <- ci["lower"]
output.table$upper95[9] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top11.25 &
                       ATE.s.hat$sex=="Female")
output.table$pval[9] <- pvals["pval.overall"]



## effect for both men and women in 26-50 programs
top26.50 <- univlist$univ[univlist$u.rank <= 50 & univlist$u.rank >=26]
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[10] <- "top 26 to 50"
output.table$estimate[10] <- ATE(x, "applied", ATE.s.hat$univ %in% top26.50)
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top26.50)
output.table$lower95[10] <- ci["lower"]
output.table$upper95[10] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top26.50)
output.table$pval[10] <- pvals["pval.overall"]




## effect for men in 26-50 programs
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[11] <- "men top 26 to 50"
output.table$estimate[11] <- ATE(x, "applied",
                                ATE.s.hat$univ %in% top26.50 &
                                ATE.s.hat$sex=="Male")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top26.50 &
                tab.data$sex=="Male")
output.table$lower95[11] <- ci["lower"]
output.table$upper95[11] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top26.50 &
                       ATE.s.hat$sex=="Male")
output.table$pval[11] <- pvals["pval.overall"]



## effect for women in 26-50 programs
ATE.s.hat <- ATE.s(x, "applied")
output.table$subgroup[12] <- "women top 26 to 50"
output.table$estimate[12] <- ATE(x, "applied",
                                ATE.s.hat$univ %in% top26.50 &
                                ATE.s.hat$sex=="Female")
tab.data <- tabulate.data(x, outcome="applied")
ci <- robins.ci(tab.data, subset=tab.data$univ %in% top26.50 &
                tab.data$sex=="Female")
output.table$lower95[12] <- ci["lower"]
output.table$upper95[12] <- ci["upper"]
pvals <- get.perm.pval(x, "applied", M=nperm,
                       subset=ATE.s.hat$univ %in% top26.50 &
                       ATE.s.hat$sex=="Female")
output.table$pval[12] <- pvals["pval.overall"]



save(output.table, file="SATEsummary.Rda")




library(xtable)

print(xtable(output.table, display=c("s", "s", "f", "f", "f", "f"),
             digits=c(10, 10, 3, 3, 3, 3)),
      include.rownames=FALSE)
